# Load packages
library(ggplot2)
library(dplyr)
library(tidyr)
library(tidyverse)
library(plotly)
# Read the coronavirus case data from ./data/coronavirus.csv.
# Column types are declared explicitly: when readr guesses them, `province`
# is empty in the rows it samples and is typed as logical, so every named
# province (e.g. "Alberta") fails to parse and is stored as NA.
coronavirus <- read_csv(
  "./data/coronavirus.csv",
  col_types = cols(
    date = col_date(format = ""),
    province = col_character(),
    country = col_character(),
    lat = col_double(),
    long = col_double(),
    type = col_character(),
    cases = col_double()
  )
)
# Preview the first rows, then show the structure and column types.
coronavirus %>% head()
coronavirus %>% str()
tibble [157,000 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ date : Date[1:157000], format: "2020-01-22" "2020-01-23" "2020-01-24" ...
$ province: logi [1:157000] NA NA NA NA NA NA ...
$ country : chr [1:157000] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
$ lat : num [1:157000] 33.9 33.9 33.9 33.9 33.9 ...
$ long : num [1:157000] 67.7 67.7 67.7 67.7 67.7 ...
$ type : chr [1:157000] "confirmed" "confirmed" "confirmed" "confirmed" ...
$ cases : num [1:157000] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "problems")= tibble [45,800 × 5] (S3: tbl_df/tbl/data.frame)
..$ row : int [1:45800] 37001 37002 37003 37004 37005 37006 37007 37008 37009 37010 ...
..$ col : chr [1:45800] "province" "province" "province" "province" ...
..$ expected: chr [1:45800] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
..$ actual : chr [1:45800] "Alberta" "Alberta" "Alberta" "Alberta" ...
..$ file : chr [1:45800] "'./data/coronavirus.csv'" "'./data/coronavirus.csv'" "'./data/coronavirus.csv'" "'./data/coronavirus.csv'" ...
- attr(*, "spec")=
.. cols(
.. date = col_date(format = ""),
.. province = col_logical(),
.. country = col_character(),
.. lat = col_double(),
.. long = col_double(),
.. type = col_character(),
.. cases = col_double()
.. )
# Total confirmed cases per country, largest totals first.
bycountry <- coronavirus %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  arrange(desc(total_cases))
`summarise()` ungrouping output (override with `.groups` argument)
# Show the per-country totals.
bycountry

# Confirmed cases per day summed over all countries, with the running total
# stored in the `cumsum` column.
confirmed_cases <- coronavirus %>%
  filter(type == "confirmed") %>%
  group_by(date) %>%
  summarise(cases = sum(cases)) %>%
  mutate(cumsum = cumsum(cases))
`summarise()` ungrouping output (override with `.groups` argument)
# confirmed_cases_china = coronavirus %>%
# group_by(date) %>%
# filter(country == "China", type == "confirmed")%>%
# summarize(cases = sum(cases)) %>%
# mutate(cumsum = cumsum(cases))
# confirmed_cases_china
# Show the daily and cumulative confirmed-case table.
confirmed_cases
NA
NA
NA
NA
NA
# Case counts on the most recent date in the data: one row per country, one
# column per case type, sorted by confirmed cases in descending order.
# No explicit select() is needed because summarise() keeps only the grouping
# columns and the new total.
coronavirus %>%
  filter(date == max(date)) %>%
  group_by(country, type) %>%
  summarise(total_cases = sum(cases)) %>%
  pivot_wider(names_from = type, values_from = total_cases) %>%
  arrange(desc(confirmed))
`summarise()` regrouping output by 'country' (override with `.groups` argument)
# Stacked area chart of cumulative active, death and recovered cases summed
# over all countries.
coronavirus %>%
  group_by(type, date) %>%
  summarise(total_cases = sum(cases)) %>%
  pivot_wider(names_from = type, values_from = total_cases) %>%
  arrange(date) %>%
  # Daily active cases are the confirmed cases minus deaths and recoveries;
  # the *_total columns are running sums over the date-ordered rows.
  mutate(
    active = confirmed - death - recovered,
    active_total = cumsum(active),
    recovered_total = cumsum(recovered),
    death_total = cumsum(death)
  ) %>%
  plot_ly(
    x = ~date,
    y = ~active_total,
    name = "Active",
    fillcolor = "#1f77b4",
    type = "scatter",
    mode = "none",
    stackgroup = "one"
  ) %>%
  add_trace(
    y = ~death_total,
    name = "Death",
    fillcolor = "#E41317"
  ) %>%
  add_trace(
    y = ~recovered_total,
    name = "Recovered",
    fillcolor = "forestgreen"
  ) %>%
  layout(
    title = "Distribution of Covid19 Cases Worldwide",
    legend = list(x = 0.1, y = 0.9),
    yaxis = list(title = "Number of Cases"),
    # The x-axis title is used to carry the data-source credit.
    xaxis = list(
      title = "Source: Johns Hopkins University Center for Systems Science and Engineering"
    )
  )
`summarise()` regrouping output by 'type' (override with `.groups` argument)
`arrange_()` is deprecated as of dplyr 0.7.0.
Please use `arrange()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
# Confirmed totals per country, largest first, each tagged with the same
# parent label ("Confirmed"). summarise() already returns an ungrouped table
# here, so no trailing ungroup() is required.
conf_df <- coronavirus %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  mutate(parents = "Confirmed") %>%
  arrange(desc(total_cases))
`summarise()` ungrouping output (override with `.groups` argument)
# Treemap of confirmed cases: one tile per country, sized by its total and
# labelled with the country, the value and its share of the parent.
conf_df %>%
  plot_ly(
    type = "treemap",
    values = ~total_cases,
    labels = ~country,
    parents = ~parents,
    domain = list(column = 0),
    name = "Confirmed",
    textinfo = "label+value+percent parent"
  )
# Line chart of cumulative confirmed cases over time for all countries.
# `confirmed_cases` stores its running total in the `cumsum` column (it has
# no `cum_cases` column), so `cumsum` is the column mapped to the y-axis.
# The mapping is set once in ggplot() and inherited by geom_line().
ggplot(confirmed_cases, aes(x = date, y = cumsum)) +
  geom_line() +
  ylab("Cumulative confirmed cases")
# Add a sequential row identifier column named "ID", previewing the data
# before and after the change.
head(coronavirus)
coronavirus <- coronavirus %>%
  tibble::rowid_to_column("ID")
head(coronavirus)
# coronavirus = coronavirus %>%
# group_by(ID) %>%
# mutate(cum_cases = cumsum(cases))
# coronavirus %>% head(70)
# Running sum of cases within each ID group (result is printed, not stored).
# NOTE(review): ID comes from the row number, so every group holds a single
# row and `cumsum` just repeats `cases` - confirm this is what was intended.
coronavirus %>%
  group_by(ID) %>%
  mutate(cumsum = cumsum(cases))
# df <- data.frame(id = rep(1:3, each = 5),
# hour = rep(1:5, 3),
# value = sample(1:15))
#
# mutate(group_by(df,id), cumsum=cumsum(value))
# Daily confirmed cases in China, with the running total in `cumsum`.
confirmed_cases_china <- coronavirus %>%
  filter(type == "confirmed", country == "China") %>%
  group_by(date) %>%
  summarise(cases = sum(cases)) %>%
  mutate(cumsum = cumsum(cases))
`summarise()` ungrouping output (override with `.groups` argument)
# Show the China series.
confirmed_cases_china

# Pick the columns of interest. `confirmed_cases_china` already has one row
# per date, so it is not summarised again (doing so would drop `cumsum`), and
# it has no `country` column because it was aggregated by date only.
confirmed_cases_china %>%
  select(date, cases, cumsum)
# Line chart of cumulative confirmed cases in China over time.
plt_cum_confirmed_cases_china <- confirmed_cases_china %>%
  ggplot(aes(x = date, y = cumsum)) +
  geom_line() +
  ylab("Cumulative confirmed cases")

# Display the plot.
plt_cum_confirmed_cases_china
# Dated events to mark on the China plot; `date` is stored as a Date.
who_events <- tibble(
  date = as.Date(c("2020-01-30", "2020-03-11", "2020-02-13")),
  event = c(
    "Global health emergency declared",
    "Pandemic declared",
    "China reporting change"
  )
)

# Overlay each event on the China plot: a dashed vertical line at its date
# and its label placed at y = 100000.
plt_cum_confirmed_cases_china +
  geom_vline(
    data = who_events,
    mapping = aes(xintercept = date),
    linetype = "dashed"
  ) +
  geom_text(
    data = who_events,
    mapping = aes(x = date, label = event),
    y = 1e5
  )
# Keep the China series from 15 February 2020 onwards.
china_after_feb15 <- confirmed_cases_china %>%
  filter(date >= as.Date("2020-02-15"))

# Plot the cumulative total (`cumsum`) against date and add a straight-line
# least-squares trend without a confidence band.
china_after_feb15 %>%
  ggplot(aes(x = date, y = cumsum)) +
  geom_line() +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE) +
  ylab("Cumulative confirmed cases")
# Daily confirmed cases for every country except China, with the running
# total in `cumsum`.
not_china <- coronavirus %>%
  filter(type == "confirmed", country != "China") %>%
  group_by(date) %>%
  summarise(cases = sum(cases)) %>%
  mutate(cumsum = cumsum(cases))
`summarise()` ungrouping output (override with `.groups` argument)
# Show the non-China series.
not_china
# not_china %>%
# group_by(date) %>%
# summarize(cases = sum(cases)) %>%
# select(date, country, cases, cumsum)

# Compact column-by-column overview of the same table.
not_china %>% glimpse()
Rows: 200
Columns: 3
$ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26, 2020-01-27, 2…
$ cases <dbl> 7, 4, 10, 7, 15, 7, 19, 10, 14, 32, 22, 10, 14, 20, 12, 12, 70, 30, 15, 84…
$ cumsum <dbl> 7, 11, 21, 28, 43, 50, 69, 79, 93, 125, 147, 157, 171, 191, 203, 215, 285,…
# Second copy of the non-China daily series, built with the same steps as
# `not_china`: confirmed rows outside China, summed per date, plus a running
# total in `cumsum`.
not_china2 <- coronavirus %>%
  filter(country != "China" & type == "confirmed") %>%
  group_by(date) %>%
  summarise(cases = sum(cases)) %>%
  mutate(cumsum = cumsum(cases))
`summarise()` ungrouping output (override with `.groups` argument)
# Show the second non-China table.
not_china2

# Series for all countries from 15 February 2020 onwards.
# The original read from `all_countries`, which does not exist at this point
# and made the statement fail. `confirmed_cases` holds the daily and
# cumulative confirmed cases summed over every country, so it is used as the
# source instead.
# NOTE(review): confirm `confirmed_cases` is the table that was intended.
world_after_feb15 <- confirmed_cases %>%
  filter(date >= as.Date("2020-02-15"))
# Cumulative confirmed cases outside China (`cumsum` against date) with a
# straight-line least-squares trend and no confidence band.
plt_not_china_trend_lin <- not_china %>%
  ggplot(aes(x = date, y = cumsum)) +
  geom_line() +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE) +
  ylab("Cumulative confirmed cases")

# Display the plot on the default linear y-axis.
plt_not_china_trend_lin

# Display the same plot with a base-10 logarithmic y-axis.
plt_not_china_trend_lin + scale_y_log10()
Filter for the top 7 countries
# Countries to keep, then the matching rows of the per-country totals.
target <- c(
  "Brazil", "India", "Mexico", "Peru", "Russia", "South Africa", "US"
)
top_7countries <- filter(bycountry, country %in% target)
top_7countries
NA
# Cumulative confirmed cases over time, one line per country in `target`.
# `top_7countries` holds a single total per country and has no `date` column,
# so it cannot be drawn as a time series. The per-country daily series is
# built from `coronavirus` instead: confirmed cases are summed per country
# and date, then accumulated within each country in date order.
coronavirus %>%
  filter(type == "confirmed", country %in% target) %>%
  group_by(country, date) %>%
  summarise(cases = sum(cases), .groups = "drop_last") %>%
  arrange(date, .by_group = TRUE) %>%
  mutate(total_cases = cumsum(cases)) %>%
  ungroup() %>%
  ggplot(aes(x = date, y = total_cases)) +
  geom_line(aes(group = country, color = country)) +
  ylab("Cumulative confirmed cases")